import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import sympy as sy

sy.init_printing()
np.set_printoptions(precision=3)
np.set_printoptions(suppress=True)


def round_expr(expr, num_digits):
    return expr.xreplace({n: round(n, num_digits) for n in expr.atoms(sy.Number)})
The Gram-Schmidt Process is an algorithm for producing an orthogonal or orthonormal basis from a set of linearly independent vectors.
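In general, given linearly independent vectors \(\mathbf{x}_1, \dots, \mathbf{x}_n\), each new basis vector is what remains of \(\mathbf{x}_k\) after subtracting its projections onto the vectors already constructed:
\[ \mathbf{v}_1 = \mathbf{x}_1, \qquad \mathbf{v}_k = \mathbf{x}_k - \sum_{j=1}^{k-1}\frac{\mathbf{x}_k^T\mathbf{v}_j}{\mathbf{v}_j^T\mathbf{v}_j}\mathbf{v}_j, \quad k = 2,\dots,n \]
Normalizing each \(\mathbf{v}_k\) then yields an orthonormal basis. The example below carries this out step by step.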
An Example in \(\mathbb{R}^3\)
Let \(W=\operatorname{Span}\left\{\mathbf{x}_{1}, \mathbf{x}_{2}, \mathbf{x}_{3}\right\}\), where \[ \mathbf{x}_{1}=\left[\begin{array}{l} 3 \\ 6 \\ 2 \end{array}\right] \text {, } \mathbf{x}_{2}=\left[\begin{array}{l} 1 \\ 2 \\ 4 \end{array}\right]\text {, and }\mathbf{x}_{3}=\left[\begin{array}{l} 2 \\ -2 \\ 1 \end{array}\right] \]. They are not orthogonal, but we can construct an orthogonal basis \(\{\mathbf{v}_1, \mathbf{v}_2, \mathbf{v}_3\}\) for \(W\) based on \(\left\{\mathbf{x}_{1}, \mathbf{x}_{2}, \mathbf{x}_{3}\right\}\). We will visualize the process.
First, we plot the three vectors together with the plane spanned by \(\mathbf{x}_{1}\) and \(\mathbf{x}_{2}\).
######################## Subspace W ##############################
s = np.linspace(-1, 1, 10)
t = np.linspace(-1, 1, 10)
S, T = np.meshgrid(s, t)

vec = np.array([[[0, 0, 0, 3, 6, 2]], [[0, 0, 0, 1, 2, 4]], [[0, 0, 0, 2, -2, 1]]])

X = vec[0, :, 3] * S + vec[1, :, 3] * T
Y = vec[0, :, 4] * S + vec[1, :, 4] * T
Z = vec[0, :, 5] * S + vec[1, :, 5] * T

fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(projection="3d")
ax.plot_wireframe(X, Y, Z, linewidth=1.5, alpha=0.3)

############################# x1, x2 and x3 ##############################
colors = ["r", "b", "g"]
labels = ["$x_1$", "$x_2$", "$x_3$"]
for i in range(vec.shape[0]):
    X, Y, Z, U, V, W = zip(*vec[i, :, :])
    ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color=colors[i],
              alpha=0.6, arrow_length_ratio=0.08, pivot="tail",
              linestyles="solid", linewidths=3)
    ax.text(vec[i, :, 3][0], vec[i, :, 4][0], vec[i, :, 5][0], s=labels[i], size=15)

ax.set_xlabel("x-axis")
ax.set_ylabel("y-axis")
ax.set_zlabel("z-axis")
plt.show()
If we choose \(\mathbf{v}_1= \mathbf{x}_1\), then \(\mathbf{v}_2\) is the component of \(\mathbf{x}_2\) orthogonal to \(\mathbf{v}_1\), i.e. what remains after subtracting the projection of \(\mathbf{x}_2\) onto \(\mathbf{v}_1\).
Define \(\text{Proj}_{\mathbf{v}_1}\mathbf{x}_2 = \alpha \mathbf{x}_1\); then \((\mathbf{x}_2 - \alpha \mathbf{x}_1)\cdot \mathbf{x}_1 = 0\). Rearranging for \(\alpha\):
\[ \alpha = \frac{\mathbf{x}_2^T\mathbf{x}_1}{\mathbf{x}_1^T\mathbf{x}_1} \]
Substituting \(\alpha\) back into the definition above:
\[ \text{Proj}_{\mathbf{v}_1}\mathbf{x}_2 = \alpha \mathbf{x}_1 = \frac{\mathbf{x}_2^T\mathbf{x}_1}{\mathbf{x}_1^T\mathbf{x}_1}\mathbf{x}_1 \]
The orthogonal component, \(\mathbf{v}_2\), is
\[ \mathbf{x}_2- \text{Proj}_{\mathbf{v}_1}\mathbf{x}_2 =\mathbf{x}_2 - \frac{\mathbf{x}_2^T\mathbf{x}_1}{\mathbf{x}_1^T\mathbf{x}_1}\mathbf{x}_1 \]
x2 = np.array([1, 2, 4])
x1 = np.array([3, 6, 2])
v2 = x2 - (x2 @ x1) / (x1 @ x1) * x1
v2
array([-0.408, -0.816,  3.061])
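As a quick sanity check, the new vector is orthogonal to \(\mathbf{v}_1 = \mathbf{x}_1\) up to floating-point rounding:
np.isclose(v2 @ x1, 0)  # True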
######################## Subspace W ##############################
s = np.linspace(-1, 1, 10)
t = np.linspace(-1, 1, 10)
S, T = np.meshgrid(s, t)

x1 = np.array([3, 6, 2])
x2 = np.array([1, 2, 4])
x3 = np.array([2, -2, 1])
v1 = x1
v2 = x2 - (x2 @ x1) / (x1 @ x1) * x1  # orthogonal component of x2, as computed above

X = x1[0] * S + x2[0] * T
Y = x1[1] * S + x2[1] * T
Z = x1[2] * S + x2[2] * T

fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(projection="3d")
ax.plot_wireframe(X, Y, Z, linewidth=1.5, alpha=0.3)

############################# x1, x2, v2, alpha*v1 ##############################
vec = np.array([[0, 0, 0, x1[0], x1[1], x1[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color="red",
          alpha=0.6, arrow_length_ratio=0.08, pivot="tail",
          linestyles="solid", linewidths=3)

vec = np.array([[0, 0, 0, x2[0], x2[1], x2[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color="blue",
          alpha=0.6, arrow_length_ratio=0.08, pivot="tail",
          linestyles="solid", linewidths=3)

vec = np.array([[0, 0, 0, x3[0], x3[1], x3[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color="green",
          alpha=0.6, arrow_length_ratio=0.08, pivot="tail",
          linestyles="solid", linewidths=3)

alpha = (x2 @ x1) / (x1 @ x1)

vec = np.array([[0, 0, 0, alpha * x1[0], alpha * x1[1], alpha * x1[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color="purple",
          alpha=0.6, arrow_length_ratio=0.12, pivot="tail",
          linestyles="solid", linewidths=3)

ax.text(x1[0], x1[1], x1[2], r"$\mathbf{x}_1 = \mathbf{v}_1$", size=15)
ax.text(x2[0], x2[1], x2[2], r"$\mathbf{x}_2$", size=15)
ax.text(x3[0], x3[1], x3[2], r"$\mathbf{x}_3$", size=15)
ax.text(alpha * x1[0], alpha * x1[1], alpha * x1[2], r"$\mathbf{\hat{x}}_2$", size=15)

ax.set_xlabel("x-axis")
ax.set_ylabel("y-axis")
ax.set_zlabel("z-axis")

################################# Dashed Line ##################################
point1 = [alpha * x1[0], alpha * x1[1], alpha * x1[2]]
point2 = [x2[0], x2[1], x2[2]]
line1 = np.array([point1, point2])
ax.plot(line1[:, 0], line1[:, 1], line1[:, 2], c="b", lw=3.5, alpha=0.5, ls="--")

point1 = [x2[0], x2[1], x2[2]]
point2 = [v2[0], v2[1], v2[2]]
line2 = np.array([point1, point2])
ax.plot(line2[:, 0], line2[:, 1], line2[:, 2], c="b", lw=3.5, alpha=0.5, ls="--")

plt.show()
Next, we find \(\mathbf{v}_3\). Define \(W = \operatorname{Span}\{\mathbf{v}_1, \mathbf{v}_2\}\); the component of \(\mathbf{x}_3\) orthogonal to \(W\) is
\[ \mathbf{x}_3- \text{Proj}_{W}\mathbf{x}_3 =\mathbf{x}_3 - \frac{\mathbf{x}_3^T\mathbf{v}_1}{\mathbf{v}_1^T\mathbf{v}_1}\mathbf{v}_1-\frac{\mathbf{x}_3^T\mathbf{v}_2}{\mathbf{v}_2^T\mathbf{v}_2}\mathbf{v}_2 \]
Again, the code is verbose, but it makes each step of the construction explicit.
x3 = np.array([2, -2, 1])
projW_x3 = (x3 @ v1) / (v1 @ v1) * v1 + (x3 @ v2) / (v2 @ v2) * v2
v3 = x3 - projW_x3
v3
array([ 2.4, -1.2,  0. ])
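A quick check confirms \(\mathbf{v}_3\) is orthogonal to both \(\mathbf{v}_1\) and \(\mathbf{v}_2\); it also matches the exact SymPy result \([12/5, -6/5, 0]\) computed below:
np.isclose(v3 @ v1, 0), np.isclose(v3 @ v2, 0)  # (True, True)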
######################## Subspace W ##############################
s = np.linspace(-1, 1, 10)
t = np.linspace(-1, 1, 10)
S, T = np.meshgrid(s, t)

x1 = np.array([3, 6, 2])
x2 = np.array([1, 2, 4])
x3 = np.array([2, -2, 1])

X = x1[0] * S + x2[0] * T
Y = x1[1] * S + x2[1] * T
Z = x1[2] * S + x2[2] * T

fig = plt.figure(figsize=(9, 9))
ax = fig.add_subplot(projection="3d")
ax.plot_wireframe(X, Y, Z, linewidth=1.5, alpha=0.3)

############################# x1, x2, x3, alpha*x1 ##############################
vec = np.array([[0, 0, 0, x1[0], x1[1], x1[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color="red",
          alpha=0.6, arrow_length_ratio=0.08, pivot="tail",
          linestyles="solid", linewidths=3)

vec = np.array([[0, 0, 0, x2[0], x2[1], x2[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color="blue",
          alpha=0.6, arrow_length_ratio=0.08, pivot="tail",
          linestyles="solid", linewidths=3)

vec = np.array([[0, 0, 0, x3[0], x3[1], x3[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color="green",
          alpha=0.6, arrow_length_ratio=0.08, pivot="tail",
          linestyles="solid", linewidths=3)

alpha = (x2 @ x1) / (x1 @ x1)
projW_x2 = alpha * x1

vec = np.array([[0, 0, 0, projW_x2[0], projW_x2[1], projW_x2[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color="purple",
          alpha=0.6, arrow_length_ratio=0.12, pivot="tail",
          linestyles="solid", linewidths=3)

ax.text(x1[0], x1[1], x1[2], r"$\mathbf{x}_1 = \mathbf{v}_1$", size=15)
ax.text(x2[0], x2[1], x2[2], r"$\mathbf{x}_2$", size=15)
ax.text(x3[0], x3[1], x3[2], r"$\mathbf{x}_3$", size=15)
ax.text(projW_x2[0], projW_x2[1], projW_x2[2], r"$\mathbf{\hat{x}}_2$", size=15)

ax.set_xlabel("x-axis")
ax.set_ylabel("y-axis")
ax.set_zlabel("z-axis")

################################# Dashed Line ##################################
point1 = projW_x2
point2 = x2
line1 = np.array([point1, point2])
ax.plot(line1[:, 0], line1[:, 1], line1[:, 2], c="b", lw=3.5, alpha=0.5, ls="--")

################################ Axes ######################################
ax.set_xlim3d(-5, 5)
ax.set_ylim3d(-5, 5)
ax.set_zlim3d(-5, 5)

plt.show()
Now we have an orthogonal basis \(\{\mathbf{v}_1, \mathbf{v}_2, \mathbf{v}_3\}\); further, we can normalize each vector. The columns of \(U\) below form an orthonormal basis.
v1 = x1
u1 = v1 / sp.linalg.norm(v1)
u2 = v2 / sp.linalg.norm(v2)
u3 = v3 / sp.linalg.norm(v3)

U = np.vstack((u1, u2, u3)).T
U
array([[ 0.429, -0.128,  0.894],
       [ 0.857, -0.256, -0.447],
       [ 0.286,  0.958,  0.   ]])
U.T @ U
array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])
We can also use SymPy's built-in functions orthogonalize or GramSchmidt for the Gram-Schmidt process.
SymPy Functions for the Gram-Schmidt Process
We need to prepare all the vectors in the form
\[ L = [\mathbf v_1,\ \mathbf v_2,\ \dots,\ \mathbf v_n] \]
where each \(\mathbf v_i,\ i\in \{1,2,\dots,n\}\), is a SymPy Matrix vector (below we store them as \(1\times 3\) row matrices).
# Define the vectors
x1 = [3, 6, 2]
x2 = [1, 2, 4]
x3 = [2, -2, 1]

# Create the list of vectors as 1x3 SymPy matrices
L = [sy.Matrix(x1).T, sy.Matrix(x2).T, sy.Matrix(x3).T]

# Perform the Gram-Schmidt process
ort = sy.GramSchmidt(L)
ort_norm = sy.GramSchmidt(L, orthonormal=True)

# Display the results
print("Orthogonal basis:")
for vec in ort:
    print(vec)

print("\nOrthonormal basis:")
for vec in ort_norm:
    print(vec)
Orthogonal basis:
Matrix([[3, 6, 2]])
Matrix([[-20/49, -40/49, 150/49]])
Matrix([[12/5, -6/5, 0]])
Orthonormal basis:
Matrix([[3/7, 6/7, 2/7]])
Matrix([[-2*sqrt(5)/35, -4*sqrt(5)/35, 3*sqrt(5)/7]])
Matrix([[2*sqrt(5)/5, -sqrt(5)/5, 0]])
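The same basis can be produced with Matrix.orthogonalize, which takes the vectors as separate arguments; a minimal sketch, assuming a recent SymPy version (pass normalize=True for the orthonormal variant):
vecs = [sy.Matrix([3, 6, 2]), sy.Matrix([1, 2, 4]), sy.Matrix([2, -2, 1])]
sy.Matrix.orthogonalize(*vecs)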
The QR Decomposition
QR decomposition is commonly used for solving linear systems and least-squares problems. It is built directly on the Gram-Schmidt process we just discussed.
Consider two matrices:
\[A=\left[\mathbf{a}_{1}, \ldots, \mathbf{a}_{n}\right] \quad \text{and} \quad Q=\left[\mathbf{u}_{1}, \ldots, \mathbf{u}_{n}\right]\]
where \(Q\) is the orthonormalized version of \(A\). We define \(R\) as \(Q^T A\):
\[R=\left[\begin{array}{ccccc} \mathbf{u}_{1}^T \mathbf{a}_{1} & \mathbf{u}_{1}^T \mathbf{a}_{2} & \mathbf{u}_{1}^T \mathbf{a}_{3} & \dots & \mathbf{u}_{1}^T \mathbf{a}_{n} \\ 0 & \mathbf{u}_{2}^T \mathbf{a}_{2} & \mathbf{u}_{2}^T \mathbf{a}_{3} & \dots & \mathbf{u}_{2}^T \mathbf{a}_{n} \\ 0 & 0 & \mathbf{u}_{3}^T \mathbf{a}_{3} & \dots & \mathbf{u}_{3}^T \mathbf{a}_{n} \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & 0 & \dots & \mathbf{u}_{n}^T \mathbf{a}_{n} \end{array}\right]\]
Since \(Q\) is an orthonormal matrix, we have:
\[ A = QR \]
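To make this concrete, we can verify the factorization numerically, reusing the orthonormal matrix U computed earlier (the columns of A are \(\mathbf{x}_1, \mathbf{x}_2, \mathbf{x}_3\)):
A = np.array([[3, 1, 2], [6, 2, -2], [2, 4, 1]])
R = U.T @ A  # upper triangular up to rounding
np.allclose(U @ R, A)  # True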
SymPy's exact \(QR\) decomposition returns entries as fractions and radicals, which can be hard to read; here we convert them to floats with sy.N() and round them with round_expr.
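A minimal sketch with the same matrix, assuming SymPy's Matrix.QRdecomposition (which returns the pair \(Q, R\)):
A_sym = sy.Matrix([[3, 1, 2], [6, 2, -2], [2, 4, 1]])
Q_sym, R_sym = A_sym.QRdecomposition()
round_expr(sy.N(Q_sym), 3), round_expr(sy.N(R_sym), 3)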
def gram_schmidt(X):
    """
    Performs QR decomposition of matrix X using the Gram-Schmidt process.
    Returns matrices Q and R such that X = QR.
    """
    n, m = X.shape
    Q = np.zeros((n, m))
    R = np.zeros((m, m))

    for j in range(m):
        v = X[:, j]
        for i in range(j):
            q = Q[:, i]
            R[i, j] = np.dot(q, v)
            v = v - R[i, j] * q
        R[j, j] = np.linalg.norm(v)
        Q[:, j] = v / R[j, j]

    return Q, R
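A quick cross-check against NumPy's built-in QR (column signs can differ between implementations, so we compare the reconstructed product instead of Q and R directly):
M = np.array([[3.0, 1.0, 2.0], [6.0, 2.0, -2.0], [2.0, 4.0, 1.0]])
Q, R = gram_schmidt(M)
np.allclose(Q @ R, M)  # True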
def solve_linear_regression(X, y):
    """
    Solves the linear regression problem using QR decomposition.
    """
    Q, R = gram_schmidt(X)

    # Compute Q^T y
    Qt_y = np.dot(Q.T, y)

    # Solve R beta = Q^T y using back substitution
    beta = np.zeros(R.shape[1])
    for i in reversed(range(R.shape[1])):
        beta[i] = (Qt_y[i] - np.dot(R[i, i + 1:], beta[i + 1:])) / R[i, i]

    return beta
# Example usage
X = np.array([[1, 1], [1, 2], [1, 3], [1, 4]])
y = np.array([6, 5, 7, 10])

beta = solve_linear_regression(X, y)
print("Coefficients:", beta)
Coefficients: [3.5 1.4]
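The coefficients agree with NumPy's least-squares solver:
np.linalg.lstsq(X, y, rcond=None)[0]  # array([3.5, 1.4])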
The Least-Squares Problem
We are not delving deeply into this topic at the moment, as we will revisit it multiple times. For those who have not studied linear regression or econometrics, it suffices to know that a least-squares solution finds coordinates \(\beta\) relative to the basis of \(\text{Col}X\), whose linear combination forms \(\hat{y}\).
\(\hat{y}\) is the orthogonal projection of \(y\) onto \(\text{Col}X\), denoted as \(\hat{y} = \text{proj}_{\text{Col}X}y\).
The distance between \(y\) and \(\hat{y}\) is the shortest among all possible \(\|y - X\beta \|\) in the vector space, meaning that
\[ \|y - X\hat{\beta}\| \leq \|y - X\beta \| \]
Every vector in \(\text{Col}X\) is orthogonal to the residual \(y - X\hat{\beta}\), the component of \(y\) orthogonal to the projection; thus
\[\begin{align} X^T(y - X\hat{\beta}) &= 0 \\ X^Ty &= X^TX\hat{\beta} \\ \hat{\beta} &= (X^TX)^{-1}X^Ty \end{align}\]
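As a sketch, the normal-equation formula reproduces the QR-based coefficients computed above (same X and y):
beta_hat = np.linalg.inv(X.T @ X) @ X.T @ y
beta_hat  # array([3.5, 1.4])
In practice, explicitly inverting \(X^TX\) is avoided for numerical stability; solving \(R\hat{\beta} = Q^Ty\) by back substitution, as we did above, is the preferred route.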